library(ggplot2)
library(corrplot)
library(plotly)
library(gapminder)
library(dplyr)
library(randomForest)
# Importing the data ----
# Locations of the CSV inputs. The two Boston paths are consumed by the
# read.csv() calls further down in this script.
nyc_path <- "clean_datasets/NYC_CLEAN.csv"
boston_path_clean <- "clean_datasets/BOSTON_CLEAN.csv"
boston_path <- "clean_datasets/boston_data.csv"
nyc_dataset <- read.csv(nyc_path, header = TRUE)
# Keep at most the first 3000 rows. head() is used instead of `[1:3000, ]`
# because positional indexing pads the result with all-NA rows whenever the
# file holds fewer than 3000 rows.
nyc_dataset <- head(nyc_dataset, 3000)
# Quick structural overview of the columns and their types.
glimpse(nyc_dataset)
Observations: 3,000
Variables: 16
$ X <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20...
$ id <int> 2539, 2595, 3647, 3831, 5022, 5099, 5121, 5178, 5203, 5238, 5295, 5441, ...
$ host_id <int> 2787, 2845, 4632, 4869, 7192, 7322, 7356, 8967, 7490, 7549, 7702, 7989, ...
$ neighbourhood_group <fct> Brooklyn, Manhattan, Manhattan, Brooklyn, Manhattan, Manhattan, Brooklyn...
$ neighbourhood <fct> Kensington, Midtown, Harlem, Clinton Hill, East Harlem, Murray Hill, Bed...
$ latitude <dbl> 40.64749, 40.75362, 40.80902, 40.68514, 40.79851, 40.74767, 40.68688, 40...
$ longitude <dbl> -73.97237, -73.98377, -73.94190, -73.95976, -73.94399, -73.97500, -73.95...
$ room_type <fct> Private room, Entire home/apt, Private room, Entire home/apt, Entire hom...
$ price <int> 149, 225, 150, 89, 80, 200, 60, 79, 79, 150, 135, 85, 89, 85, 120, 140, ...
$ minimum_nights <int> 1, 1, 3, 1, 10, 3, 45, 2, 2, 1, 5, 2, 4, 2, 90, 2, 2, 1, 3, 7, 3, 2, 1, ...
$ number_of_reviews <int> 9, 45, 0, 270, 9, 74, 49, 430, 118, 160, 53, 188, 167, 113, 27, 148, 198...
$ calculated_host_listings_count <int> 6, 2, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 3, 1, 1, 1, 1, 1, 1, 2, 1, 6, 6, 6, ...
$ availability_365 <int> 365, 355, 365, 194, 0, 129, 0, 220, 0, 188, 6, 39, 314, 333, 0, 46, 321,...
$ room_type_cat <int> 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, ...
$ neighbourhood_cat <int> 108, 127, 94, 41, 61, 137, 13, 95, 202, 35, 202, 95, 182, 202, 209, 214,...
$ neighbourhood_group_cat <int> 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, ...
# Boston is loaded from two files: `boston_dataset` feeds the plots below,
# `boston_dataset_numerical` feeds the Boston models.
boston_dataset_numerical <- read.csv(file = boston_path, header = TRUE)
boston_dataset <- read.csv(file = boston_path_clean, header = TRUE)
# Room type counts ----
# Stack the room types of both cities into one long data frame. The `fill`
# column records the city so the combined chart can colour the bars by city.
all <- rbind(
  data.frame(fill = "NYC", room_types = nyc_dataset$room_type),
  data.frame(fill = "Boston", room_types = boston_dataset$room_type)
)
# One bar chart per city.
ggplot(nyc_dataset, aes(x = room_type, fill = room_type)) +
  geom_bar() +
  ggtitle("Room count NYC listing")
ggplot(boston_dataset, aes(x = room_type, fill = room_type)) +
  geom_bar() +
  ggtitle("Room count Boston listing")
# Combined chart: this plot draws both cities, so its title names both
# (it previously repeated the Boston-only title).
ggplot(all, aes(x = room_types, fill = fill)) +
  geom_bar() +
  ggtitle("Room count NYC and Boston listings")
# Listing coordinates coloured by room type, one scatter plot per city.
# NOTE(review): latitude is mapped to x and longitude to y, so the picture is
# transposed relative to a conventional map - confirm this is intended.
ggplot(
  data = nyc_dataset,
  mapping = aes(x = latitude, y = longitude, colour = room_type)
) +
  geom_point() +
  labs(title = "Map/room type NYC listing") +
  scale_colour_brewer(palette = "Dark2")
ggplot(
  data = boston_dataset,
  mapping = aes(x = latitude, y = longitude, colour = room_type)
) +
  geom_point() +
  labs(title = "Map/room type Boston listing") +
  scale_colour_brewer(palette = "Dark2")
# NYC listings per neighbourhood group, stacked by room type.
# The bars are coloured through the `fill` aesthetic, so the Brewer palette
# has to be attached with scale_fill_brewer(). scale_color_brewer() targets
# the colour aesthetic, which this plot does not map, and therefore left the
# default fill colours in place.
ggplot(nyc_dataset, aes(x = neighbourhood_group, fill = room_type)) +
  geom_bar() +
  ggtitle("Neighbourhood Group with Room Type") +
  scale_fill_brewer(palette = "Dark2")
# Price - NYC ----
# Listings with price below 300. which() turns the logical mask into row
# positions, so a row whose comparison is NA is dropped instead of showing up
# as an all-NA row. (The former bare `nyc_dataset[which(...), ]` statement was
# removed: it only dumped the subset to the console without storing it.)
price_nyc <- nyc_dataset[which(nyc_dataset$price < 300), ]
# Price distribution per room type (price < 300 only).
ggplot(price_nyc, aes(x = room_type, y = price)) +
  geom_boxplot() +
  ggtitle("Price per room type NYC")
# Coordinates of all loaded NYC listings.
ggplot(nyc_dataset, aes(x = latitude, y = longitude)) +
  geom_point() +
  ggtitle("Map of the NYC listings")
# Same coordinates for the price < 300 subset, coloured by price.
ggplot(price_nyc, aes(x = latitude, y = longitude, color = price)) +
  geom_point() +
  ggtitle("Map by price NYC listing")
# Number of reviews per price - NYC
# Scatter of review count against price for the price < 300 subset, coloured
# by neighbourhood group.
ggplot(
  data = price_nyc,
  mapping = aes(x = number_of_reviews, y = price, colour = neighbourhood_group)
) +
  geom_point(alpha = 0.5) +
  scale_colour_brewer(palette = "Dark2")
# Minimum stay against price, limited to stays under 30 nights and prices
# under 300; rows where the condition is NA are left out.
keep_rows <- nyc_dataset$minimum_nights < 30 & nyc_dataset$price < 300
min_night_vs_price <- nyc_dataset[which(keep_rows), ]
ggplot(
  data = min_night_vs_price,
  mapping = aes(x = minimum_nights, y = price, colour = room_type)
) +
  geom_point(alpha = 0.1) +
  scale_colour_brewer(palette = "Dark2")
# Cleveland plot - NYC
# Price per neighbourhood group: a grey segment runs from x = 0 to each
# point, and points are coloured by room type.
ggplot(data = price_nyc, mapping = aes(x = price, y = neighbourhood_group)) +
  geom_segment(
    mapping = aes(yend = neighbourhood_group),
    xend = 0,
    color = "grey50"
  ) +
  geom_point(mapping = aes(color = room_type), size = 3) +
  scale_color_brewer(
    palette = "Set1",
    limits = c("Private room", "Entire home/apt", "Shared room")
  ) +
  theme_bw() +
  theme(
    legend.justification = c(1, 0.5),
    legend.position = c(1, 0.55),
    panel.grid.major.y = element_blank()
  )
# Same layout with the number of reviews on the x axis.
ggplot(
  data = price_nyc,
  mapping = aes(x = number_of_reviews, y = neighbourhood_group)
) +
  geom_segment(
    mapping = aes(yend = neighbourhood_group),
    xend = 0,
    color = "grey50"
  ) +
  geom_point(mapping = aes(color = room_type), size = 3) +
  scale_color_brewer(
    palette = "Set1",
    limits = c("Private room", "Entire home/apt", "Shared room")
  ) +
  theme_bw() +
  theme(
    legend.justification = c(1, 0.5),
    legend.position = c(1, 0.55),
    panel.grid.major.y = element_blank()
  )
# Minimum nights per neighbourhood group; groups are ordered on the x axis
# with reorder() on minimum_nights and the tick labels are rotated.
ggplot(
  data = price_nyc,
  mapping = aes(
    x = reorder(neighbourhood_group, minimum_nights),
    y = minimum_nights
  )
) +
  geom_point(size = 3) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    panel.grid.major.x = element_line(colour = "grey60", linetype = "dashed"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  )
# Cleveland plot - BOSTON
# Price per neighbourhood: a grey segment runs from x = 0 to each point, and
# points are coloured by room type.
ggplot(
  data = boston_dataset,
  mapping = aes(x = price, y = neighbourhood_cleansed)
) +
  geom_segment(
    mapping = aes(yend = neighbourhood_cleansed),
    xend = 0,
    color = "grey50"
  ) +
  geom_point(mapping = aes(color = room_type), size = 3) +
  scale_color_brewer(
    palette = "Set1",
    limits = c("Private room", "Entire home/apt", "Shared room")
  ) +
  theme_bw() +
  theme(
    legend.justification = c(1, 0.5),
    legend.position = c(1, 0.55),
    panel.grid.major.y = element_blank()
  )
# Same layout with the number of reviews on the x axis.
ggplot(
  data = boston_dataset,
  mapping = aes(x = number_of_reviews, y = neighbourhood_cleansed)
) +
  geom_segment(
    mapping = aes(yend = neighbourhood_cleansed),
    xend = 0,
    color = "grey50"
  ) +
  geom_point(mapping = aes(color = room_type), size = 3) +
  scale_color_brewer(
    palette = "Set1",
    limits = c("Private room", "Entire home/apt", "Shared room")
  ) +
  theme_bw() +
  theme(
    legend.justification = c(1, 0.5),
    legend.position = c(1, 0.55),
    panel.grid.major.y = element_blank()
  )
# Minimum nights per neighbourhood; neighbourhoods are ordered on the x axis
# with reorder() on minimum_nights and the tick labels are rotated.
ggplot(
  data = boston_dataset,
  mapping = aes(
    x = reorder(neighbourhood_cleansed, minimum_nights),
    y = minimum_nights
  )
) +
  geom_point(size = 3) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    panel.grid.major.x = element_line(colour = "grey60", linetype = "dashed"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  )
Below are interactive versions of the map/room-type plots shown earlier:
# Interactive NYC map coloured by room type.
# The `+` after ggtitle() is required: without it scale_color_brewer() is a
# separate top-level statement, so the palette never reaches the plot and the
# scale object is merely printed to the console. The title also names the
# dataset that is actually plotted (NYC, not Boston).
fig <- ggplot(nyc_dataset, aes(x = latitude, y = longitude, color = room_type)) +
  geom_point(alpha = 0.2) +
  ggtitle("Map/room type NYC listing - Interactive") +
  scale_color_brewer(palette = "Dark2")
# Convert the static ggplot into a plotly widget and display it.
fig <- ggplotly(fig)
fig
# Interactive Boston map coloured by room type.
# The `+` after ggtitle() is required: without it scale_color_brewer() is a
# separate top-level statement, so the palette never reaches the plot and the
# scale object is merely printed to the console.
fig <- ggplot(boston_dataset, aes(x = latitude, y = longitude, color = room_type)) +
  geom_point(alpha = 0.2) +
  ggtitle("Map/room type Boston listing - Interactive") +
  scale_color_brewer(palette = "Dark2")
# Convert the static ggplot into a plotly widget and display it.
fig <- ggplotly(fig)
fig
# Reviews vs price (interactive).
# Both limits must hold at once. Previously the second assignment filtered
# nyc_dataset again and overwrote the first result, silently discarding the
# number_of_reviews < 100 restriction. which() keeps rows with an NA
# comparison out of the subset.
reviews_vs_price <- nyc_dataset[
  which(nyc_dataset$number_of_reviews < 100 & nyc_dataset$price < 500),
]
rev_vs_price <- ggplot(
  reviews_vs_price,
  aes(x = price, y = number_of_reviews, color = room_type)
) +
  geom_point(alpha = 0.3)
ggplotly(rev_vs_price)
# Violin Plot
# Price distribution per neighbourhood group for listings priced below 300,
# with a point layer summarising each group via the "median" function.
# NOTE(review): older ggplot2 releases do not know the `fun` argument and
# report "Ignoring unknown parameters: fun"; confirm the installed version
# honours it, otherwise the point is not the median.
price_nyc <- nyc_dataset[which(nyc_dataset$price < 300), ]
violin <- ggplot(
  data = price_nyc,
  mapping = aes(x = neighbourhood_group, y = price, fill = neighbourhood_group)
) +
  geom_violin(adjust = 0.8, trim = TRUE) +
  stat_summary(fun = "median", geom = "point") +
  ggtitle(label = "Price per neighbourhood in NYC")
Ignoring unknown parameters: fun
# Render the violin plot as an interactive plotly widget.
ggplotly(p = violin)
No summary function supplied, defaulting to `mean_se()`
Now let’s do some modeling. We first apply linear regression to the NYC dataset in order to predict prices:
# Linear Regression Model
# Least-squares fit of price on every other column of nyc_dataset.
# NOTE(review): `.` also pulls in the index/id columns (X, id, host_id) and
# the *_cat columns, which look like numeric encodings of the factor columns -
# presumably why some coefficients are reported as not defined because of
# singularities. Confirm whether these predictors should be excluded.
nyc_model <- lm(formula = price ~ ., data = nyc_dataset)
# The Model summary
summary(object = nyc_model)
Call:
lm(formula = price ~ ., data = nyc_dataset)
Residuals:
Min 1Q Median 3Q Max
-298.9 -53.7 -18.0 18.6 4537.7
Coefficients: (7 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -8.104e+04 4.793e+04 -1.691 0.090984 .
X 1.446e-02 2.670e-02 0.542 0.588125
id -5.057e-07 4.731e-05 -0.011 0.991472
host_id -5.473e-06 1.917e-06 -2.855 0.004336 **
neighbourhood_groupBrooklyn 8.497e+01 1.650e+02 0.515 0.606501
neighbourhood_groupManhattan 1.149e+02 1.308e+02 0.878 0.379771
neighbourhood_groupQueens 4.393e+01 1.241e+02 0.354 0.723484
neighbourhood_groupStaten Island 2.882e+01 3.401e+02 0.085 0.932474
neighbourhoodArrochar 7.960e+01 2.202e+02 0.362 0.717707
neighbourhoodArverne 1.523e+02 1.643e+02 0.927 0.354060
neighbourhoodAstoria -1.457e+01 6.873e+01 -0.212 0.832148
neighbourhoodBattery Park City 1.366e+02 1.392e+02 0.981 0.326491
neighbourhoodBay Ridge -1.367e+02 1.039e+02 -1.315 0.188626
neighbourhoodBayside 2.539e+02 1.995e+02 1.273 0.203161
neighbourhoodBedford-Stuyvesant -2.447e+01 5.496e+01 -0.445 0.656146
neighbourhoodBellerose 2.118e+02 2.057e+02 1.030 0.303228
neighbourhoodBensonhurst -6.962e+01 1.352e+02 -0.515 0.606687
neighbourhoodBoerum Hill 2.456e+01 7.362e+01 0.334 0.738692
neighbourhoodBorough Park -2.520e+01 1.335e+02 -0.189 0.850227
neighbourhoodBriarwood 7.555e+01 1.199e+02 0.630 0.528585
neighbourhoodBrighton Beach -2.396e+01 1.041e+02 -0.230 0.817942
neighbourhoodBrooklyn Heights 1.536e+02 7.168e+01 2.143 0.032193 *
neighbourhoodBushwick -9.934e+00 6.392e+01 -0.155 0.876501
neighbourhoodCambria Heights 1.332e+02 1.691e+02 0.787 0.431139
neighbourhoodCanarsie 6.528e+01 9.584e+01 0.681 0.495827
neighbourhoodCarroll Gardens 5.427e+01 6.145e+01 0.883 0.377216
neighbourhoodChelsea -3.145e+01 2.717e+01 -1.158 0.247162
neighbourhoodChinatown -6.543e+01 4.068e+01 -1.609 0.107824
neighbourhoodCity Island 6.308e+01 1.965e+02 0.321 0.748206
neighbourhoodCivic Center -6.557e+00 1.040e+02 -0.063 0.949708
neighbourhoodClason Point 6.306e+01 1.950e+02 0.323 0.746394
neighbourhoodClifton -8.561e+01 2.694e+02 -0.318 0.750686
neighbourhoodClinton Hill 4.675e+00 5.622e+01 0.083 0.933734
neighbourhoodCobble Hill -2.686e+01 7.842e+01 -0.343 0.731959
neighbourhoodCollege Point 2.686e+02 1.902e+02 1.412 0.158059
neighbourhoodColumbia St -3.739e+01 1.144e+02 -0.327 0.743774
neighbourhoodConcord -5.762e+01 2.341e+02 -0.246 0.805597
neighbourhoodConcourse 2.513e+01 1.334e+02 0.188 0.850570
neighbourhoodConcourse Village 3.026e+00 1.953e+02 0.015 0.987639
neighbourhoodCrown Heights -2.548e+01 5.303e+01 -0.481 0.630846
neighbourhoodCypress Hills 5.282e+01 9.757e+01 0.541 0.588291
neighbourhoodDitmars Steinway 1.436e+01 8.013e+01 0.179 0.857779
neighbourhoodDowntown Brooklyn -3.035e+01 1.147e+02 -0.265 0.791357
neighbourhoodDUMBO 2.237e+01 9.662e+01 0.232 0.816899
neighbourhoodEast Elmhurst 1.179e+01 1.193e+02 0.099 0.921276
neighbourhoodEast Flatbush -2.779e+01 6.786e+01 -0.410 0.682197
neighbourhoodEast Harlem -1.008e+02 5.560e+01 -1.814 0.069847 .
neighbourhoodEast New York -1.506e+01 9.486e+01 -0.159 0.873882
neighbourhoodEast Village -4.141e+01 2.572e+01 -1.610 0.107571
neighbourhoodEastchester -2.189e+01 1.933e+02 -0.113 0.909838
neighbourhoodElmhurst -7.987e+00 1.010e+02 -0.079 0.937003
neighbourhoodEmerson Hill -8.543e+01 2.595e+02 -0.329 0.741997
neighbourhoodFieldston -1.378e+02 1.943e+02 -0.709 0.478369
neighbourhoodFinancial District -7.624e+00 5.311e+01 -0.144 0.885870
neighbourhoodFlatbush -2.964e+01 5.869e+01 -0.505 0.613628
neighbourhoodFlatiron District 9.435e+01 5.907e+01 1.597 0.110304
neighbourhoodFlatlands -1.171e+01 1.368e+02 -0.086 0.931814
neighbourhoodFlushing 8.571e+01 8.961e+01 0.957 0.338896
neighbourhoodForest Hills 1.067e+01 1.896e+02 0.056 0.955121
neighbourhoodFort Greene -2.996e+01 5.665e+01 -0.529 0.596902
neighbourhoodFort Hamilton -1.128e+02 1.054e+02 -1.070 0.284920
neighbourhoodGlendale 1.138e+01 1.412e+02 0.081 0.935755
neighbourhoodGowanus -1.286e+01 6.311e+01 -0.204 0.838525
neighbourhoodGramercy -5.395e+01 4.554e+01 -1.185 0.236277
neighbourhoodGraniteville -1.695e+02 2.598e+02 -0.652 0.514188
neighbourhoodGravesend -3.583e+01 1.366e+02 -0.262 0.793091
neighbourhoodGreenpoint 4.860e+00 6.930e+01 0.070 0.944092
neighbourhoodGreenwich Village -1.564e+01 3.477e+01 -0.450 0.652942
neighbourhoodHarlem -1.025e+02 6.018e+01 -1.704 0.088547 .
neighbourhoodHell's Kitchen -7.366e+01 3.303e+01 -2.230 0.025805 *
neighbourhoodHighbridge -3.017e+01 1.518e+02 -0.199 0.842462
neighbourhoodInwood -1.332e+02 9.812e+01 -1.357 0.174810
neighbourhoodJackson Heights 1.237e+01 9.516e+01 0.130 0.896569
neighbourhoodJamaica 1.515e+02 1.352e+02 1.120 0.262690
neighbourhoodKensington -2.716e+01 6.324e+01 -0.430 0.667582
neighbourhoodKew Gardens 5.995e+01 1.917e+02 0.313 0.754516
neighbourhoodKingsbridge -4.428e+01 1.303e+02 -0.340 0.733981
neighbourhoodKips Bay -9.400e+01 4.272e+01 -2.200 0.027862 *
neighbourhoodLittle Italy -2.563e+01 6.967e+01 -0.368 0.713008
neighbourhoodLong Island City 1.853e+01 7.342e+01 0.252 0.800734
neighbourhoodLongwood -4.297e+00 1.949e+02 -0.022 0.982415
neighbourhoodLower East Side -5.267e+01 3.114e+01 -1.692 0.090813 .
neighbourhoodMariners Harbor -1.411e+02 2.613e+02 -0.540 0.589365
neighbourhoodMaspeth 3.176e+01 1.194e+02 0.266 0.790246
neighbourhoodMiddle Village 6.672e+01 1.408e+02 0.474 0.635656
neighbourhoodMidtown -5.725e+01 3.726e+01 -1.536 0.124586
neighbourhoodMidwood 2.281e+01 1.138e+02 0.201 0.841094
neighbourhoodMorningside Heights -1.229e+02 6.773e+01 -1.814 0.069739 .
neighbourhoodMorris Heights -2.309e+01 1.939e+02 -0.119 0.905239
neighbourhoodMott Haven -4.482e+00 1.520e+02 -0.029 0.976478
neighbourhoodMount Eden -1.980e+01 1.497e+02 -0.132 0.894799
neighbourhoodMurray Hill -1.704e+01 6.059e+01 -0.281 0.778545
neighbourhoodNew Springville -9.403e+01 2.561e+02 -0.367 0.713555
neighbourhoodNoHo 5.928e+00 7.437e+01 0.080 0.936466
neighbourhoodNolita 1.214e+02 3.906e+01 3.109 0.001895 **
neighbourhoodOzone Park 4.945e+01 1.948e+02 0.254 0.799643
neighbourhoodPark Slope 1.058e+00 5.328e+01 0.020 0.984163
neighbourhoodPort Morris 1.764e+01 1.359e+02 0.130 0.896683
neighbourhoodPort Richmond 2.076e+01 2.327e+02 0.089 0.928922
neighbourhoodProspect-Lefferts Gardens -3.167e+01 5.555e+01 -0.570 0.568674
neighbourhoodProspect Heights 5.183e+01 5.483e+01 0.945 0.344620
neighbourhoodQueens Village 1.240e+02 1.454e+02 0.853 0.393780
neighbourhoodRed Hook -7.969e+01 1.142e+02 -0.698 0.485528
neighbourhoodRego Park 4.607e+01 1.217e+02 0.379 0.704990
neighbourhoodRichmond Hill 4.525e+01 1.490e+02 0.304 0.761410
neighbourhoodRidgewood 2.429e+01 8.198e+01 0.296 0.766998
neighbourhoodRockaway Beach 1.551e+02 2.183e+02 0.710 0.477558
neighbourhoodRoosevelt Island -7.461e+01 1.294e+02 -0.576 0.564385
neighbourhoodSheepshead Bay -6.444e+01 1.098e+02 -0.587 0.557250
neighbourhoodShore Acres -1.219e+02 2.393e+02 -0.509 0.610673
neighbourhoodSoHo -2.784e+00 3.758e+01 -0.074 0.940941
neighbourhoodSoundview 2.552e+01 1.944e+02 0.131 0.895571
neighbourhoodSouth Slope 7.910e+00 5.687e+01 0.139 0.889391
neighbourhoodSpuyten Duyvil -7.320e+01 1.946e+02 -0.376 0.706913
neighbourhoodSt. Albans 1.365e+02 1.225e+02 1.114 0.265214
neighbourhoodSt. George -6.132e+01 2.428e+02 -0.253 0.800637
neighbourhoodStapleton -5.073e+01 2.699e+02 -0.188 0.850938
neighbourhoodSunnyside -3.811e+00 7.533e+01 -0.051 0.959648
neighbourhoodSunset Park -8.019e+01 6.062e+01 -1.323 0.186021
neighbourhoodTheater District -2.693e+01 7.640e+01 -0.352 0.724494
neighbourhoodTompkinsville -8.481e+01 2.176e+02 -0.390 0.696693
neighbourhoodTottenville NA NA NA NA
neighbourhoodTribeca 1.023e+02 5.708e+01 1.792 0.073208 .
neighbourhoodTwo Bridges -7.174e+01 9.107e+01 -0.788 0.430880
neighbourhoodUniversity Heights -2.305e+01 1.934e+02 -0.119 0.905159
neighbourhoodUpper East Side -6.696e+01 4.245e+01 -1.577 0.114817
neighbourhoodUpper West Side -2.061e+01 4.230e+01 -0.487 0.626119
neighbourhoodVinegar Hill -3.753e+01 1.360e+02 -0.276 0.782648
neighbourhoodWakefield -9.704e+00 1.936e+02 -0.050 0.960030
neighbourhoodWashington Heights -1.409e+02 7.976e+01 -1.767 0.077389 .
neighbourhoodWest Village NA NA NA NA
neighbourhoodWilliamsbridge -3.833e+00 1.931e+02 -0.020 0.984165
neighbourhoodWilliamsburg -8.518e+00 6.088e+01 -0.140 0.888736
neighbourhoodWindsor Terrace NA NA NA NA
neighbourhoodWoodlawn -5.704e+01 1.935e+02 -0.295 0.768208
neighbourhoodWoodside NA NA NA NA
latitude 4.978e+02 6.244e+02 0.797 0.425443
longitude -8.231e+02 5.186e+02 -1.587 0.112620
room_typePrivate room -8.930e+01 7.089e+00 -12.597 < 2e-16 ***
room_typeShared room -9.954e+01 3.022e+01 -3.294 0.001001 **
minimum_nights -3.024e-01 1.179e-01 -2.564 0.010392 *
number_of_reviews -1.494e-01 4.141e-02 -3.608 0.000314 ***
calculated_host_listings_count -6.041e-01 6.950e-01 -0.869 0.384816
availability_365 1.513e-01 2.546e-02 5.944 3.12e-09 ***
room_type_cat NA NA NA NA
neighbourhood_cat NA NA NA NA
neighbourhood_group_cat NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 175.4 on 2860 degrees of freedom
Multiple R-squared: 0.1589, Adjusted R-squared: 0.1181
F-statistic: 3.889 on 139 and 2860 DF, p-value: < 2.2e-16
# Round the summary
# Coefficient table of the NYC model, rounded to four decimals, then printed.
coeffs <- round(coef(summary(nyc_model)), digits = 4)
coeffs
Estimate Std. Error t value Pr(>|t|)
(Intercept) -81037.7043 47928.5597 -1.6908 0.0910
X 0.0145 0.0267 0.5416 0.5881
id 0.0000 0.0000 -0.0107 0.9915
host_id 0.0000 0.0000 -2.8549 0.0043
neighbourhood_groupBrooklyn 84.9733 164.9546 0.5151 0.6065
neighbourhood_groupManhattan 114.8587 130.7509 0.8785 0.3798
neighbourhood_groupQueens 43.9262 124.1411 0.3538 0.7235
neighbourhood_groupStaten Island 28.8169 340.0631 0.0847 0.9325
neighbourhoodArrochar 79.5986 220.1528 0.3616 0.7177
neighbourhoodArverne 152.3181 164.3320 0.9269 0.3541
neighbourhoodAstoria -14.5694 68.7342 -0.2120 0.8321
neighbourhoodBattery Park City 136.6015 139.1939 0.9814 0.3265
neighbourhoodBay Ridge -136.6811 103.9425 -1.3150 0.1886
neighbourhoodBayside 253.9017 199.4691 1.2729 0.2032
neighbourhoodBedford-Stuyvesant -24.4715 54.9566 -0.4453 0.6561
neighbourhoodBellerose 211.8294 205.7155 1.0297 0.3032
neighbourhoodBensonhurst -69.6163 135.2126 -0.5149 0.6067
neighbourhoodBoerum Hill 24.5605 73.6185 0.3336 0.7387
neighbourhoodBorough Park -25.2031 133.4584 -0.1888 0.8502
neighbourhoodBriarwood 75.5490 119.8726 0.6302 0.5286
neighbourhoodBrighton Beach -23.9557 104.0587 -0.2302 0.8179
neighbourhoodBrooklyn Heights 153.6113 71.6786 2.1431 0.0322
neighbourhoodBushwick -9.9344 63.9195 -0.1554 0.8765
neighbourhoodCambria Heights 133.1607 169.1241 0.7874 0.4311
neighbourhoodCanarsie 65.2807 95.8379 0.6812 0.4958
neighbourhoodCarroll Gardens 54.2744 61.4537 0.8832 0.3772
neighbourhoodChelsea -31.4548 27.1746 -1.1575 0.2472
neighbourhoodChinatown -65.4296 40.6761 -1.6086 0.1078
neighbourhoodCity Island 63.0786 196.4843 0.3210 0.7482
neighbourhoodCivic Center -6.5574 103.9551 -0.0631 0.9497
neighbourhoodClason Point 63.0587 194.9694 0.3234 0.7464
neighbourhoodClifton -85.6126 269.4210 -0.3178 0.7507
neighbourhoodClinton Hill 4.6748 56.2179 0.0832 0.9337
neighbourhoodCobble Hill -26.8643 78.4237 -0.3426 0.7320
neighbourhoodCollege Point 268.5508 190.1920 1.4120 0.1581
neighbourhoodColumbia St -37.3942 114.3934 -0.3269 0.7438
neighbourhoodConcord -57.6151 234.0811 -0.2461 0.8056
neighbourhoodConcourse 25.1290 133.3753 0.1884 0.8506
neighbourhoodConcourse Village 3.0262 195.3064 0.0155 0.9876
neighbourhoodCrown Heights -25.4834 53.0254 -0.4806 0.6308
neighbourhoodCypress Hills 52.8208 97.5679 0.5414 0.5883
neighbourhoodDitmars Steinway 14.3613 80.1330 0.1792 0.8578
neighbourhoodDowntown Brooklyn -30.3467 114.7004 -0.2646 0.7914
neighbourhoodDUMBO 22.3724 96.6174 0.2316 0.8169
neighbourhoodEast Elmhurst 11.7865 119.2530 0.0988 0.9213
neighbourhoodEast Flatbush -27.7874 67.8555 -0.4095 0.6822
neighbourhoodEast Harlem -100.8311 55.5978 -1.8136 0.0698
neighbourhoodEast New York -15.0590 94.8635 -0.1587 0.8739
neighbourhoodEast Village -41.4068 25.7231 -1.6097 0.1076
neighbourhoodEastchester -21.8909 193.2912 -0.1133 0.9098
neighbourhoodElmhurst -7.9868 101.0429 -0.0790 0.9370
neighbourhoodEmerson Hill -85.4251 259.4596 -0.3292 0.7420
neighbourhoodFieldston -137.7619 194.2986 -0.7090 0.4784
neighbourhoodFinancial District -7.6240 53.1122 -0.1435 0.8859
neighbourhoodFlatbush -29.6354 58.6890 -0.5050 0.6136
neighbourhoodFlatiron District 94.3498 59.0674 1.5973 0.1103
neighbourhoodFlatlands -11.7083 136.8280 -0.0856 0.9318
neighbourhoodFlushing 85.7110 89.6081 0.9565 0.3389
neighbourhoodForest Hills 10.6730 189.6333 0.0563 0.9551
neighbourhoodFort Greene -29.9633 56.6500 -0.5289 0.5969
neighbourhoodFort Hamilton -112.7787 105.4467 -1.0695 0.2849
neighbourhoodGlendale 11.3825 141.1980 0.0806 0.9358
neighbourhoodGowanus -12.8613 63.1076 -0.2038 0.8385
neighbourhoodGramercy -53.9461 45.5399 -1.1846 0.2363
neighbourhoodGraniteville -169.4701 259.7595 -0.6524 0.5142
neighbourhoodGravesend -35.8265 136.5737 -0.2623 0.7931
neighbourhoodGreenpoint 4.8604 69.3016 0.0701 0.9441
neighbourhoodGreenwich Village -15.6368 34.7696 -0.4497 0.6529
neighbourhoodHarlem -102.5286 60.1802 -1.7037 0.0885
neighbourhoodHell's Kitchen -73.6608 33.0274 -2.2303 0.0258
neighbourhoodHighbridge -30.1726 151.8015 -0.1988 0.8425
neighbourhoodInwood -133.1676 98.1161 -1.3572 0.1748
neighbourhoodJackson Heights 12.3720 95.1634 0.1300 0.8966
neighbourhoodJamaica 151.5045 135.2385 1.1203 0.2627
neighbourhoodKensington -27.1636 63.2428 -0.4295 0.6676
neighbourhoodKew Gardens 59.9544 191.7194 0.3127 0.7545
neighbourhoodKingsbridge -44.2772 130.2777 -0.3399 0.7340
neighbourhoodKips Bay -94.0041 42.7223 -2.2003 0.0279
neighbourhoodLittle Italy -25.6284 69.6697 -0.3679 0.7130
neighbourhoodLong Island City 18.5320 73.4171 0.2524 0.8007
neighbourhoodLongwood -4.2969 194.9257 -0.0220 0.9824
neighbourhoodLower East Side -52.6729 31.1361 -1.6917 0.0908
neighbourhoodMariners Harbor -141.0688 261.3272 -0.5398 0.5894
neighbourhoodMaspeth 31.7618 119.3978 0.2660 0.7902
neighbourhoodMiddle Village 66.7155 140.7999 0.4738 0.6357
neighbourhoodMidtown -57.2454 37.2629 -1.5363 0.1246
neighbourhoodMidwood 22.8109 113.7631 0.2005 0.8411
neighbourhoodMorningside Heights -122.8724 67.7250 -1.8143 0.0697
neighbourhoodMorris Heights -23.0879 193.9242 -0.1191 0.9052
neighbourhoodMott Haven -4.4820 152.0012 -0.0295 0.9765
neighbourhoodMount Eden -19.7984 149.7097 -0.1322 0.8948
neighbourhoodMurray Hill -17.0410 60.5918 -0.2812 0.7785
neighbourhoodNew Springville -94.0288 256.1239 -0.3671 0.7136
neighbourhoodNoHo 5.9285 74.3662 0.0797 0.9365
neighbourhoodNolita 121.4445 39.0600 3.1092 0.0019
neighbourhoodOzone Park 49.4487 194.8088 0.2538 0.7996
neighbourhoodPark Slope 1.0578 53.2826 0.0199 0.9842
neighbourhoodPort Morris 17.6444 135.8691 0.1299 0.8967
neighbourhoodPort Richmond 20.7590 232.7010 0.0892 0.9289
neighbourhoodProspect-Lefferts Gardens -31.6668 55.5489 -0.5701 0.5687
neighbourhoodProspect Heights 51.8296 54.8326 0.9452 0.3446
neighbourhoodQueens Village 124.0316 145.4216 0.8529 0.3938
neighbourhoodRed Hook -79.6884 114.2436 -0.6975 0.4855
neighbourhoodRego Park 46.0698 121.6747 0.3786 0.7050
neighbourhoodRichmond Hill 45.2480 149.0096 0.3037 0.7614
neighbourhoodRidgewood 24.2935 81.9808 0.2963 0.7670
neighbourhoodRockaway Beach 155.0815 218.3234 0.7103 0.4776
neighbourhoodRoosevelt Island -74.6101 129.4397 -0.5764 0.5644
neighbourhoodSheepshead Bay -64.4416 109.7814 -0.5870 0.5573
neighbourhoodShore Acres -121.8644 239.3398 -0.5092 0.6107
neighbourhoodSoHo -2.7842 37.5761 -0.0741 0.9409
neighbourhoodSoundview 25.5161 194.3790 0.1313 0.8956
neighbourhoodSouth Slope 7.9103 56.8732 0.1391 0.8894
neighbourhoodSpuyten Duyvil -73.1967 194.6498 -0.3760 0.7069
neighbourhoodSt. Albans 136.5205 122.5092 1.1144 0.2652
neighbourhoodSt. George -61.3215 242.8125 -0.2525 0.8006
neighbourhoodStapleton -50.7280 269.9175 -0.1879 0.8509
neighbourhoodSunnyside -3.8115 75.3252 -0.0506 0.9596
neighbourhoodSunset Park -80.1902 60.6234 -1.3228 0.1860
neighbourhoodTheater District -26.9319 76.4037 -0.3525 0.7245
neighbourhoodTompkinsville -84.8085 217.5542 -0.3898 0.6967
neighbourhoodTribeca 102.2976 57.0796 1.7922 0.0732
neighbourhoodTwo Bridges -71.7430 91.0678 -0.7878 0.4309
neighbourhoodUniversity Heights -23.0509 193.4491 -0.1192 0.9052
neighbourhoodUpper East Side -66.9583 42.4488 -1.5774 0.1148
neighbourhoodUpper West Side -20.6127 42.3042 -0.4872 0.6261
neighbourhoodVinegar Hill -37.5331 136.0409 -0.2759 0.7826
neighbourhoodWakefield -9.7036 193.6080 -0.0501 0.9600
neighbourhoodWashington Heights -140.9051 79.7571 -1.7667 0.0774
neighbourhoodWilliamsbridge -3.8327 193.0895 -0.0198 0.9842
neighbourhoodWilliamsburg -8.5181 60.8800 -0.1399 0.8887
neighbourhoodWoodlawn -57.0422 193.5292 -0.2947 0.7682
latitude 497.7551 624.4359 0.7971 0.4254
longitude -823.0831 518.6357 -1.5870 0.1126
room_typePrivate room -89.2965 7.0889 -12.5967 0.0000
room_typeShared room -99.5421 30.2219 -3.2937 0.0010
minimum_nights -0.3024 0.1179 -2.5642 0.0104
number_of_reviews -0.1494 0.0414 -3.6080 0.0003
calculated_host_listings_count -0.6041 0.6950 -0.8692 0.3848
availability_365 0.1513 0.0255 5.9436 0.0000
# Linear model for Boston
# Least-squares fit of price on every other column of
# boston_dataset_numerical.
boston_model <- lm(formula = price ~ ., data = boston_dataset_numerical)
# The Model summary
summary(object = boston_model)
Call:
lm(formula = price ~ ., data = boston_dataset_numerical)
Residuals:
Min 1Q Median 3Q Max
-233.58 -23.51 0.91 21.47 1030.87
Coefficients: (5 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.281e+03 2.100e+04 0.299 0.764948
X 5.784e-03 8.765e-03 0.660 0.509365
bathrooms 2.952e+01 3.463e+00 8.526 < 2e-16 ***
bedrooms 2.166e+01 2.731e+00 7.931 3.28e-15 ***
beds 6.802e+00 2.026e+00 3.357 0.000801 ***
cleaning_fee 1.739e-01 3.521e-02 4.938 8.44e-07 ***
availability_365 1.682e-02 1.067e-02 1.578 0.114797
number_of_reviews -2.366e-02 4.514e-02 -0.524 0.600149
latitude -2.618e+01 2.349e+02 -0.111 0.911266
longitude 7.353e+01 2.230e+02 0.330 0.741601
minimum_nights 5.115e-01 1.608e-01 3.180 0.001490 **
maximum_nights -2.008e-07 6.668e-07 -0.301 0.763288
property_type_Apartment -2.321e+01 6.647e+01 -0.349 0.726989
property_type_Aprtament 2.377e+01 9.386e+01 0.253 0.800098
property_type_Bed...Breakfast -1.664e+01 6.803e+01 -0.245 0.806780
property_type_Boat -3.819e+01 7.002e+01 -0.545 0.585504
property_type_Camper.RV -2.317e+01 9.372e+01 -0.247 0.804788
property_type_Condominium -2.350e+01 6.667e+01 -0.353 0.724478
property_type_Dorm -1.728e+01 9.393e+01 -0.184 0.854091
property_type_Entire.Floor -4.393e+01 8.175e+01 -0.537 0.591056
property_type_Guesthouse -1.183e+01 9.496e+01 -0.125 0.900887
property_type_House -2.205e+01 6.664e+01 -0.331 0.740804
property_type_Loft -1.739e+01 6.746e+01 -0.258 0.796573
property_type_Other NA NA NA NA
property_type_Townhouse 1.484e-01 6.747e+01 0.002 0.998245
property_type_Villa NA NA NA NA
room_type_Entire.home.apt 2.396e+01 1.320e+01 1.815 0.069573 .
room_type_Private.room 9.178e+00 1.300e+01 0.706 0.480339
room_type_Shared.room NA NA NA NA
neighbourhood_cleansed_Allston 3.977e+01 2.259e+01 1.761 0.078412 .
neighbourhood_cleansed_Back.Bay 8.511e+01 2.655e+01 3.205 0.001367 **
neighbourhood_cleansed_Bay.Village 9.707e+01 3.466e+01 2.801 0.005137 **
neighbourhood_cleansed_Beacon.Hill 9.064e+01 2.777e+01 3.264 0.001114 **
neighbourhood_cleansed_Brighton 4.242e+01 2.124e+01 1.997 0.045925 *
neighbourhood_cleansed_Charlestown 6.838e+01 3.088e+01 2.215 0.026882 *
neighbourhood_cleansed_Chinatown 7.129e+01 3.286e+01 2.169 0.030155 *
neighbourhood_cleansed_Dorchester 2.199e+01 2.517e+01 0.874 0.382396
neighbourhood_cleansed_Downtown 6.419e+01 2.799e+01 2.293 0.021929 *
neighbourhood_cleansed_East.Boston 4.173e+01 3.525e+01 1.184 0.236562
neighbourhood_cleansed_Fenway 6.568e+01 2.237e+01 2.936 0.003354 **
neighbourhood_cleansed_Hyde.Park 3.388e+01 2.583e+01 1.312 0.189702
neighbourhood_cleansed_Jamaica.Plain 5.327e+01 2.771e+01 1.922 0.054666 .
neighbourhood_cleansed_Leather.District 1.089e+02 5.005e+01 2.177 0.029589 *
neighbourhood_cleansed_Longwood.Medical.Area 3.845e+01 3.429e+01 1.121 0.262269
neighbourhood_cleansed_Mattapan 3.544e+01 3.077e+01 1.152 0.249513
neighbourhood_cleansed_Mission.Hill 5.123e+01 2.820e+01 1.817 0.069395 .
neighbourhood_cleansed_North.End 6.478e+01 3.396e+01 1.907 0.056596 .
neighbourhood_cleansed_Roslindale 4.693e+01 2.936e+01 1.599 0.110045
neighbourhood_cleansed_Roxbury 5.699e+01 2.722e+01 2.093 0.036410 *
neighbourhood_cleansed_South.Boston 4.298e+01 2.911e+01 1.477 0.139862
neighbourhood_cleansed_South.Boston.Waterfront 7.980e+01 3.034e+01 2.631 0.008576 **
neighbourhood_cleansed_South.End 6.968e+01 2.783e+01 2.504 0.012362 *
neighbourhood_cleansed_West.End 5.177e+01 2.981e+01 1.736 0.082641 .
neighbourhood_cleansed_West.Roxbury NA NA NA NA
bed_type_Airbed -1.387e+01 1.312e+01 -1.058 0.290318
bed_type_Couch -4.688e+01 3.139e+01 -1.494 0.135427
bed_type_Futon -1.317e+00 1.179e+01 -0.112 0.911093
bed_type_Pull.out.Sofa -9.604e-01 1.701e+01 -0.056 0.954968
bed_type_Real.Bed NA NA NA NA
labels 6.127e+01 1.839e+00 33.322 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 66.08 on 2423 degrees of freedom
(1107 observations deleted due to missingness)
Multiple R-squared: 0.702, Adjusted R-squared: 0.6953
F-statistic: 105.7 on 54 and 2423 DF, p-value: < 2.2e-16
# Round the summary - Boston
# Coefficient table of the Boston model, rounded to four decimals, then
# printed.
coeffs <- round(coef(summary(boston_model)), digits = 4)
coeffs
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6280.6920 21004.0248 0.2990 0.7649
X 0.0058 0.0088 0.6599 0.5094
bathrooms 29.5212 3.4626 8.5256 0.0000
bedrooms 21.6584 2.7307 7.9314 0.0000
beds 6.8024 2.0264 3.3568 0.0008
cleaning_fee 0.1739 0.0352 4.9377 0.0000
availability_365 0.0168 0.0107 1.5776 0.1148
number_of_reviews -0.0237 0.0451 -0.5243 0.6001
latitude -26.1775 234.8745 -0.1115 0.9113
longitude 73.5328 222.9811 0.3298 0.7416
minimum_nights 0.5115 0.1608 3.1803 0.0015
maximum_nights 0.0000 0.0000 -0.3012 0.7633
property_type_Apartment -23.2097 66.4705 -0.3492 0.7270
property_type_Aprtament 23.7693 93.8576 0.2532 0.8001
property_type_Bed...Breakfast -16.6418 68.0345 -0.2446 0.8068
property_type_Boat -38.1902 70.0179 -0.5454 0.5855
property_type_Camper.RV -23.1667 93.7229 -0.2472 0.8048
property_type_Condominium -23.5031 66.6713 -0.3525 0.7245
property_type_Dorm -17.2750 93.9265 -0.1839 0.8541
property_type_Entire.Floor -43.9313 81.7512 -0.5374 0.5911
property_type_Guesthouse -11.8273 94.9573 -0.1246 0.9009
property_type_House -22.0476 66.6441 -0.3308 0.7408
property_type_Loft -17.3927 67.4626 -0.2578 0.7966
property_type_Townhouse 0.1484 67.4696 0.0022 0.9982
room_type_Entire.home.apt 23.9594 13.1972 1.8155 0.0696
room_type_Private.room 9.1777 13.0021 0.7059 0.4803
neighbourhood_cleansed_Allston 39.7739 22.5895 1.7607 0.0784
neighbourhood_cleansed_Back.Bay 85.1120 26.5531 3.2053 0.0014
neighbourhood_cleansed_Bay.Village 97.0710 34.6577 2.8008 0.0051
neighbourhood_cleansed_Beacon.Hill 90.6365 27.7693 3.2639 0.0011
neighbourhood_cleansed_Brighton 42.4234 21.2423 1.9971 0.0459
neighbourhood_cleansed_Charlestown 68.3762 30.8756 2.2146 0.0269
neighbourhood_cleansed_Chinatown 71.2853 32.8606 2.1693 0.0302
neighbourhood_cleansed_Dorchester 21.9896 25.1698 0.8736 0.3824
neighbourhood_cleansed_Downtown 64.1944 27.9950 2.2931 0.0219
neighbourhood_cleansed_East.Boston 41.7314 35.2487 1.1839 0.2366
neighbourhood_cleansed_Fenway 65.6759 22.3674 2.9362 0.0034
neighbourhood_cleansed_Hyde.Park 33.8784 25.8253 1.3118 0.1897
neighbourhood_cleansed_Jamaica.Plain 53.2684 27.7085 1.9225 0.0547
neighbourhood_cleansed_Leather.District 108.9491 50.0492 2.1768 0.0296
neighbourhood_cleansed_Longwood.Medical.Area 38.4462 34.2870 1.1213 0.2623
neighbourhood_cleansed_Mattapan 35.4407 30.7696 1.1518 0.2495
neighbourhood_cleansed_Mission.Hill 51.2285 28.1995 1.8166 0.0694
neighbourhood_cleansed_North.End 64.7807 33.9641 1.9073 0.0566
neighbourhood_cleansed_Roslindale 46.9329 29.3592 1.5986 0.1100
neighbourhood_cleansed_Roxbury 56.9924 27.2237 2.0935 0.0364
neighbourhood_cleansed_South.Boston 42.9844 29.1066 1.4768 0.1399
neighbourhood_cleansed_South.Boston.Waterfront 79.8033 30.3357 2.6307 0.0086
neighbourhood_cleansed_South.End 69.6776 27.8318 2.5035 0.0124
neighbourhood_cleansed_West.End 51.7658 29.8142 1.7363 0.0826
bed_type_Airbed -13.8729 13.1166 -1.0577 0.2903
bed_type_Couch -46.8792 31.3881 -1.4935 0.1354
bed_type_Futon -1.3165 11.7894 -0.1117 0.9111
bed_type_Pull.out.Sofa -0.9604 17.0064 -0.0565 0.9550
labels 61.2722 1.8388 33.3219 0.0000
# RANDOM FOREST - NYC ----
# Fix the RNG seed so that the random split and the fitted forest are
# reproducible from run to run.
set.seed(42)
# Split the data into training and test: every row draws label 1 with
# probability 0.3 or label 2 with probability 0.7.
# NOTE(review): label 1 is the training set, so only about 30% of the rows
# train the model - confirm the proportions are not inverted.
index <- sample(2, nrow(nyc_dataset), replace = TRUE, prob = c(0.3, 0.7))
# The training data
train_nyc <- nyc_dataset[index == 1, ]
# The testing data
test_nyc <- nyc_dataset[index == 2, ]
# The model
rfm <- randomForest(
  price ~ latitude + longitude + neighbourhood_group_cat + minimum_nights +
    availability_365,
  data = train_nyc
)
# Predictions on the held-out rows, using the same predictor columns.
price_preds <- predict(
  rfm,
  test_nyc[c(
    "latitude", "longitude", "neighbourhood_group_cat",
    "minimum_nights", "availability_365"
  )]
)
test_nyc$price_preds <- price_preds
# Actual vs predicted price, side by side.
diff <- test_nyc[c("price", "price_preds")]
diff
# Predicted against actual price with a linear fit and its 99% band. The
# dangling empty `color =` argument was dropped from aes().
ggplot(diff, aes(x = price, y = price_preds)) +
  geom_point(colour = "grey60") +
  stat_smooth(method = lm, level = 0.99, colour = "red")
# RANDOM FOREST - Boston ----
# Fix the RNG seed so that the random split and the fitted forest are
# reproducible from run to run.
set.seed(42)
# Split the data into training and test: every row draws label 1 with
# probability 0.3 or label 2 with probability 0.7.
# NOTE(review): label 1 is the training set, so only about 30% of the rows
# train the model - confirm the proportions are not inverted.
index <- sample(
  2, nrow(boston_dataset_numerical),
  replace = TRUE, prob = c(0.3, 0.7)
)
# The training data
train_boston <- boston_dataset_numerical[index == 1, ]
# The testing data
test_boston <- boston_dataset_numerical[index == 2, ]
# The model
# Rows with any missing value are dropped from both parts before fitting.
train_boston <- na.omit(train_boston)
test_boston <- na.omit(test_boston)
# NOTE(review): `price ~ .` uses every remaining column as a predictor,
# including the row index column X - confirm that is intended.
rfm <- randomForest(price ~ ., data = train_boston)
# Predictions
price_preds <- predict(rfm, test_boston)
test_boston$price_preds <- price_preds
# Actual vs predicted price, side by side.
diff <- test_boston[c("price", "price_preds")]
diff
# Predicted against actual price with a linear fit and its 99% band. The
# dangling empty `color =` argument was dropped from aes().
ggplot(diff, aes(x = price, y = price_preds)) +
  geom_point(colour = "grey60") +
  stat_smooth(method = lm, level = 0.99, colour = "red")